3. Train-Predict-XGBoost


In [46]:
import time
import os
import pandas as pd

# Identify this run: project, pipeline step and launch timestamp.
project_name = 'Dog_Breed_Identification'
step_name = 'Train-Predict-XGBoost'
time_str = time.strftime("%Y%m%d_%H%M%S", time.localtime())
run_name = '_'.join([project_name, step_name, time_str])
print('run_name: ' + run_name)

# Working directories, all located directly under the current working directory.
cwd = os.getcwd()
log_path, model_path, output_path = (
    os.path.join(cwd, sub_dir) for sub_dir in ('log', 'model', 'output')
)
print('log_path: \t' + log_path)
print('model_path: \t' + model_path)
print('output_path: \t' + output_path)

run_name: Dog_Breed_Identification_Train-Predict-XGBoost_20171101_221638
log_path: 	E:\Udacity\MachineLearning(Advanced)\p6_graduation_project\log
model_path: 	E:\Udacity\MachineLearning(Advanced)\p6_graduation_project\model
output_path: 	E:\Udacity\MachineLearning(Advanced)\p6_graduation_project\output

In [27]:
# Load the training labels and report how many rows were read.
df = pd.read_csv(os.path.join(cwd, 'input', 'labels.csv'))
print('lables amount: %d' %len(df))
# Show the first rows (the recorded output lists `id` and `breed` columns).
df.head()

lables amount: 10222
id breed
0 000bec180eb18c7604dcecc8fe0dba07 boston_bull
1 001513dfcb2ffafc82cccf4d8bbaba97 dingo
2 001cdf01b096e06d78e9e5112d419397 pekinese
3 00214f311d5d2247d5dfe4fe24b2303d bluetick
4 0021f9ceb3235effd7fcde7f7538ed62 golden_retriever

In [28]:
import h5py
import numpy as np
from sklearn.utils import shuffle

# Bottleneck features exported by an earlier step: one HDF5 file per
# pretrained network, all stored under ./model.
cwd = os.getcwd()
feature_cgg16 = os.path.join(cwd, 'model', 'feature_VGG16_{}.h5'.format(20171026))
feature_cgg19 = os.path.join(cwd, 'model', 'feature_VGG19_{}.h5'.format(20171026))
feature_resnet50 = os.path.join(cwd, 'model', 'feature_ResNet50_{}.h5'.format(20171026))
feature_xception = os.path.join(cwd, 'model', 'feature_Xception_{}.h5'.format(20171026))
feature_inception = os.path.join(cwd, 'model', 'feature_InceptionV3_{}.h5'.format(20171026))
# feature_inceptionResNetV2 = os.path.join(cwd, 'model', 'feature_InceptionResNetV2_{}.h5'.format(20171028))

x_train = []
y_train = {}
x_test = []
# Placeholders only; a later cell fills these via train_test_split.
x_val = []
y_val = {}

for filename in [feature_cgg16, feature_cgg19, feature_resnet50, feature_xception, feature_inception]:
    with h5py.File(filename, 'r') as h:
        # NOTE(review): the dataset names 'train' and 'test' are assumed from the
        # 'train_labels' naming -- confirm against the feature-extraction notebook.
        x_train.append(np.array(h['train']))
        x_test.append(np.array(h['test']))
        # Labels are re-read from every file; the last file's copy is kept.
        # presumably identical in every file -- verify.
        y_train = np.array(h['train_labels'])

# Join the per-network feature blocks along the last axis so each sample
# carries the features of all networks.
x_train = np.concatenate(x_train, axis=-1)
x_test = np.concatenate(x_test, axis=-1)

print(x_train.shape)
print(x_test.shape)

(10222, 7168)
(10357, 7168)

In [29]:
from sklearn.utils import shuffle
# Shuffle features and labels together; the fixed seed keeps the order (and
# therefore the later train/validation split) reproducible across runs.
(x_train, y_train) = shuffle(x_train, y_train, random_state=2017)

In [30]:
from sklearn.model_selection import train_test_split
# Hold out 5% of the rows for validation (fixed seed for reproducibility).
# NOTE: this rebinds x_train/y_train, so re-running the cell without re-running
# the loading cell shrinks the training set again.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.05, random_state=2017)
print(x_train.shape)
print(x_val.shape)

(9710, 7168)
(512, 7168)

In [31]:
from keras.utils.np_utils import to_categorical

# y_train = to_categorical(y_train)
# y_val = to_categorical(y_val)


Build Model

In [32]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [43]:
# Wrap each split in the DMatrix container that xgboost trains and predicts on.
xg_train = xgb.DMatrix(x_train, label=y_train)
xg_val = xgb.DMatrix(x_val, label=y_val)
xg_test = xgb.DMatrix(x_test)

# Booster settings for the multi-class run.
param = {
    'objective': 'multi:softmax',  # softmax multi-class classification
    'eta': 0.1,                    # learning rate (shrinkage applied per boosting step)
    'max_depth': 50,
    'silent': 1,
    'nthread': 4,
    'num_class': 120,
}

# Both sets are evaluated after every boosting round.
watchlist = [(xg_train, 'train'), (xg_val, 'val')]
num_round = 5
bst = xgb.train(param, xg_train, num_round, watchlist)

[0]	train-merror:0.252935	val-merror:0.361328
[1]	train-merror:0.14861	val-merror:0.298828
[2]	train-merror:0.106076	val-merror:0.257812
[3]	train-merror:0.075695	val-merror:0.242188
[4]	train-merror:0.054892	val-merror:0.226562
Wall time: 14min 47s

In [44]:
# File name for the softmax booster; relative, so it resolves against the
# current working directory.
model_name = run_name + '.bin'
# Persist the trained booster so it can be loaded back from this file.
bst.save_model(model_name)

In [45]:
# Create an empty booster (4 threads), then populate it from the saved file.
bst0 = xgb.Booster(params={'nthread': 4})
bst0.load_model(fname=model_name)

In [49]:
# Predict on the validation set with the reloaded booster and show the
# first few predictions as a quick sanity check.
y_pred = bst0.predict(xg_val)
print(y_pred[:5])

[  0.   5.  14.  94.   2.]

In [50]:
# Train a second booster with the same settings, this time asking for
# per-class probabilities instead of a single class id.
# NOTE: this updates the shared `param` dict in place.
param.update({'objective': 'multi:softprob'})
bst1 = xgb.train(params=param, dtrain=xg_train, num_boost_round=num_round, evals=watchlist)
# Note carried over from the original cell: older xgboost versions returned the
# softprob prediction as a flat 1-D array that had to be reshaped to
# (ndata, nclass) before taking np.argmax(..., axis=1) to recover labels.

[0]	train-merror:0.252935	val-merror:0.361328
[1]	train-merror:0.14861	val-merror:0.298828
[2]	train-merror:0.106076	val-merror:0.257812
[3]	train-merror:0.075695	val-merror:0.242188
[4]	train-merror:0.054892	val-merror:0.226562

In [52]:
# File name for the softprob booster; relative, so it resolves against the
# current working directory.
model_name = run_name + '_prob.bin'
# Persist the probability model so it can be loaded back from this file.
bst1.save_model(model_name)

In [53]:
# Rebind bst0 to a fresh booster (4 threads) and populate it from the file
# currently named by model_name.
bst0 = xgb.Booster(params={'nthread': 4})
bst0.load_model(fname=model_name)

In [51]:
# Predict on the validation set with the softprob booster, then show the
# result's shape and its first five rows.
y_pred = bst1.predict(xg_val)
print(y_pred.shape)
print(y_pred[:5])

(512, 120)
[[ 0.2974295   0.00490525  0.00490315  0.00490537  0.00500645  0.00490574
   0.0049357   0.00490251  0.00490329  0.00490563  0.00490444  0.00490323
   0.00494976  0.00490511  0.00490492  0.00490526  0.0049048   0.00496223
   0.00490462  0.00490526  0.00490454  0.00490512  0.00577949  0.00490558
   0.00490477  0.00490458  0.0071401   0.00490558  0.00490444  0.00490545
   0.00502031  0.00490284  0.0049055   0.00490569  0.00490459  0.00563949
   0.00490418  0.00490501  0.00555472  0.00490506  0.00490551  0.00500069
   0.00490202  0.00490547  0.00490484  0.00490522  0.00490505  0.00528597
   0.00490526  0.00490467  0.00490415  0.00492774  0.00490459  0.00490551
   0.00662434  0.00490357  0.00490445  0.00490508  0.00589457  0.00490437
   0.00490517  0.00490458  0.00490286  0.00490502  0.04553157  0.00490419
   0.00490526  0.00490534  0.00490429  0.00490533  0.0049056   0.00490531
   0.00490403  0.0049024   0.00490438  0.00490484  0.00490572  0.00621409
   0.06089636  0.00490542  0.00490123  0.00490466  0.00490556  0.00490487
   0.00490427  0.0049045   0.0049055   0.00490346  0.00490342  0.00490521
   0.00490474  0.00490423  0.00490513  0.00490549  0.00528533  0.00490312
   0.01256359  0.00490426  0.00490406  0.00490497  0.00569648  0.00490533
   0.00490544  0.0049052   0.00490549  0.00496761  0.00494197  0.00490408
   0.00490494  0.00490511  0.00711655  0.00567312  0.00490527  0.00490543
   0.00490357  0.00528359  0.00536904  0.0049822   0.00490476  0.00500313]
 [ 0.00664251  0.00664275  0.0066399   0.00664292  0.00715593  0.13026524
   0.00664294  0.00663904  0.00664009  0.00664326  0.00664165  0.00975148
   0.00681564  0.00664256  0.007697    0.00664276  0.00664214  0.01204315
   0.00664189  0.00680158  0.00664179  0.00664257  0.00664207  0.00664319
   0.0066421   0.00664184  0.00664245  0.00933286  0.00664166  0.00854256
   0.00663859  0.00663948  0.00664309  0.02280047  0.00664186  0.00673126
   0.0066413   0.00664242  0.00735943  0.00664249  0.00664311  0.00664071
   0.0107274   0.00664305  0.0066422   0.0066427   0.01167289  0.00664258
   0.00664276  0.00751382  0.00664126  0.00664255  0.00664185  0.00727148
   0.00664281  0.00664047  0.00664166  0.00664252  0.00663977  0.00664155
   0.00664264  0.00664184  0.00663951  0.00963491  0.00664108  0.00664132
   0.00664277  0.00703968  0.00670006  0.00667426  0.00664322  0.00664283
   0.0066411   0.00663888  0.00664157  0.02073119  0.00664338  0.00667883
   0.01736585  0.00715883  0.00663731  0.00664195  0.00664317  0.00664223
   0.00675325  0.00664173  0.00664308  0.00664033  0.00664027  0.0066427
   0.00670577  0.00664136  0.00664258  0.00769393  0.00664221  0.00663986
   0.00664279  0.00664141  0.00664113  0.00664237  0.00664212  0.00664286
   0.00679436  0.00664268  0.00664307  0.00664133  0.00664307  0.00664115
   0.01184149  0.00664256  0.0066424   0.00664157  0.00664277  0.00664299
   0.00787519  0.00664094  0.00664299  0.00664273  0.00668845  0.00669705]
 [ 0.0051162   0.00511637  0.00511418  0.0051165   0.00511602  0.00511689
   0.00511652  0.00511352  0.05320547  0.00511677  0.00511553  0.00511426
   0.02889798  0.00511623  0.30715349  0.00511638  0.00511591  0.00511535
   0.00511571  0.00511639  0.00511564  0.00511624  0.00511585  0.00511672
   0.00511587  0.005234    0.00511615  0.00511672  0.00511553  0.00511659
   0.00511317  0.00741165  0.00511664  0.00511683  0.00511569  0.00511491
   0.00511526  0.00511613  0.00517258  0.00511617  0.00511665  0.00556918
   0.00511301  0.00511661  0.00511595  0.00511634  0.00640045  0.00511625
   0.00511638  0.00511577  0.00511523  0.00511622  0.00511569  0.00511665
   0.00511643  0.00511462  0.00511554  0.0051162   0.00511408  0.00511545
   0.00511629  0.00511567  0.00511388  0.00511613  0.00511509  0.00511527
   0.00511639  0.00511647  0.00780102  0.00511646  0.00527611  0.00511644
   0.00578488  0.0051134   0.00511547  0.00511595  0.00511686  0.00511676
   0.00511666  0.00511656  0.00511218  0.00511576  0.0051167   0.00533481
   0.00511535  0.00511559  0.00511663  0.00511451  0.00511447  0.00525322
   0.00511584  0.00511531  0.00511625  0.00627326  0.00511596  0.00511415
   0.0051164   0.00511535  0.00511513  0.00511609  0.00522371  0.00511646
   0.00511657  0.00511632  0.00551276  0.00511529  0.00571635  0.00679448
   0.00521185  0.00519542  0.00511611  0.00511547  0.00511639  0.00511656
   0.00511463  0.00511498  0.00511656  0.00511636  0.00511586  0.00511636]
 [ 0.00502803  0.00499871  0.00499657  0.00499884  0.00499836  0.00499921
   0.00499885  0.00499592  0.00499671  0.0049991   0.00499788  0.00499665
   0.00499837  0.00499857  0.00499838  0.00499872  0.00504145  0.00499771
   0.00499807  0.00499872  0.00499799  0.00499857  0.0049982   0.00499904
   0.00499822  0.00499803  0.00499849  0.00508885  0.00499789  0.00499892
   0.02519783  0.00499625  0.00499897  0.0052715   0.00499804  0.00499727
   0.00499762  0.00499847  0.00499857  0.00538681  0.00499898  0.00499717
   0.00499542  0.00499894  0.0049983   0.00499868  0.00499851  0.00499859
   0.00499872  0.00499812  0.00499759  0.01606635  0.00499804  0.00499898
   0.00499876  0.00545304  0.00499789  0.00565864  0.00499647  0.00499781
   0.00499863  0.00499803  0.00672605  0.00499847  0.00499746  0.00499763
   0.00499872  0.00499881  0.00504184  0.00499879  0.00499907  0.00499878
   0.00499747  0.00506755  0.00499782  0.00510018  0.0050289   0.00499909
   0.00499899  0.00499889  0.00499462  0.00499811  0.00499903  0.00499832
   0.00514893  0.00499795  0.00499896  0.00499689  0.00499685  0.00499867
   0.00499819  0.00499767  0.00499859  0.00499896  0.36800078  0.00614943
   0.00499874  0.0049977   0.00499749  0.00499843  0.00499824  0.00499879
   0.0049989   0.00499866  0.00499896  0.00566063  0.00499896  0.00499751
   0.00506728  0.00499857  0.00499845  0.00499782  0.00499873  0.00499889
   0.004997    0.00499735  0.00499889  0.0049987   0.00499821  0.00499869]
 [ 0.00524388  0.00524406  0.35276937  0.0052442   0.0052437   0.00524459
   0.00543621  0.00524114  0.00524197  0.00524447  0.0052432   0.0052419
   0.00524371  0.00524391  0.00524371  0.00524407  0.00524358  0.00524302
   0.00565103  0.00544742  0.00524331  0.00524392  0.00524353  0.00524441
   0.00524355  0.00524335  0.00532671  0.00524442  0.0052432   0.00524428
   0.00524078  0.00524149  0.00524433  0.00524453  0.00524336  0.00531893
   0.00529505  0.0062623   0.00524392  0.00524386  0.00524435  0.00524245
   0.00524061  0.0052443   0.00524363  0.01015866  0.00524385  0.00524394
   0.00524407  0.00524345  0.00529815  0.0052681   0.0056493   0.00524435
   0.00524412  0.00524227  0.00571694  0.00524388  0.0056489   0.00524312
   0.0145527   0.00530208  0.00524151  0.00524382  0.00524275  0.00524293
   0.00524408  0.00524417  0.00524303  0.00524415  0.00524444  0.00524413
   0.00524276  0.00524101  0.00524313  0.00524363  0.00524456  0.00524446
   0.00524435  0.00524425  0.00523977  0.00524344  0.0052444   0.00643719
   0.00524301  0.00524326  0.00533296  0.00524216  0.00524211  0.00524402
   0.00593787  0.00761592  0.00524393  0.00593885  0.00524364  0.00524179
   0.00524409  0.00524301  0.00524279  0.00524377  0.00524357  0.00524415
   0.00524427  0.00524401  0.00524432  0.00524295  0.00537048  0.00524281
   0.00524373  0.00524392  0.00524379  0.00524313  0.00524408  0.00524426
   0.00524227  0.00524264  0.00524426  0.00524405  0.00524354  0.00565101]]

In [ ]:

In [ ]:

In [ ]:
# Validation accuracy of the probability model: the predicted class is the
# column with the highest probability.
# Assumes y_pred still holds the (n_val, n_class) validation probabilities
# produced by bst1 -- re-run that prediction cell first if it was overwritten.
final_acc = accuracy_score(y_val, np.argmax(y_pred, axis=1))
print('final_acc: %.4f' % final_acc)
# Tag the run name with the accuracy as a zero-padded 4-digit integer
# (e.g. 0.7734 -> '7734') so output files sort by score.
run_name0 = run_name + '_' + str(int(final_acc*10000)).zfill(4)

In [ ]:


In [ ]:
# Used to load model directly and skip train
# import os
# from keras.models import load_model
# cwd = os.getcwd()
# model = load_model(os.path.join(cwd, 'model', 'Dog_Breed_Identification_Train_20171024_155154.h5'))

In [ ]:
# Predict the test set with the softprob booster trained above; the submission
# cell below indexes y_pred by column, so it needs one probability column per
# class. (The previous `model.predict(..., batch_size=128)` was a leftover from
# a Keras notebook: `model` is never defined here.)
y_pred = bst1.predict(xg_test)

In [ ]:
# print(y_pred[:10])
# y_pred = np.clip(y_pred, 0.005, 0.995)
# print(y_pred[:10])

In [ ]:
# Directory listing of the test images folder.
test_image_dir = os.path.join(cwd, 'input', 'data_test', 'test')
files = os.listdir(test_image_dir)

In [ ]:
# Re-read the label table relative to the current working directory.
cwd = os.getcwd()
labels_csv = os.path.join(cwd, 'input', 'labels.csv')
df = pd.read_csv(labels_csv)
print('lables amount: %d' % len(df))

In [ ]:
n = len(df)
# Sort the distinct breed names: iterating a plain set of strings gives an
# arbitrary order, so the class ids could differ from one run to the next.
# NOTE(review): confirm this alphabetical order matches the one used to encode
# 'train_labels' in the feature-extraction step.
breed = sorted(set(df['breed']))
n_class = len(breed)
# Two-way lookup between breed name and integer class id.
class_to_num = dict(zip(breed, range(n_class)))
num_to_class = dict(zip(range(n_class), breed))

In [ ]:
# Load the submission template. The path is built with os.path.join (like the
# other input paths in this notebook) instead of a Windows-only literal, so the
# cell also runs on Linux/macOS.
df2 = pd.read_csv(os.path.join(cwd, 'input', 'sample_submission.csv'))
n_test = len(df2)

In [ ]:
# Column 0 of the template is kept as-is; every remaining column receives one
# column of y_pred, in order.
n_pred_cols = df2.shape[1] - 1
assert y_pred.shape == (len(df2), n_pred_cols), \
    'y_pred shape %s does not match the submission template' % (y_pred.shape,)
df2.iloc[:, 1:] = y_pred

# Create the output directory on first use; the original `if` had no body.
if not os.path.exists(output_path):
    os.makedirs(output_path)
pred_file = os.path.join(output_path, 'pred_' + run_name0 + '.csv')
df2.to_csv(pred_file, index=False)

In [ ]:

In [ ]:
# Final cell: prints a completion marker once every cell above has run.
print('Done !')

In [ ]: